# Importing the necessary libraries:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
from mlxtend.plotting import plot_decision_regions
# Load the user-behaviour dataset (path is relative to the working directory).
df = pd.read_csv("../user_behavior_dataset.csv")
# Exploratory checks run while inspecting the data (kept for reference):
# df.isnull().sum()  # There don't seem to be any null values in the dataset.
# df.isna().sum()  # There don't seem to be any NA values in the dataset.
# df.head()
# df['Device Model'].nunique()  # number of distinct smartphone models

# Fixed colour per device model so the pair plot is coloured consistently.
phone_models = {
    'Google Pixel 5': 'tab:orange',
    'OnePlus 9': 'tab:green',
    'Xiaomi Mi 11': 'tab:blue',
    'iPhone 12': 'tab:red',
    'Samsung Galaxy S21': 'tab:purple',
}
sns.pairplot(df, hue='Device Model', palette=phone_models, height=3)

# Colour lookup keyed by column name. Only the 'User Behavior Class' column is
# passed to the scatter plot below, so presumably only that entry is used.
categories = {
    'Age': 'tab:orange',
    'Device Model': 'tab:green',
    'Screen On Time (hours/day)': 'tab:blue',
    'User ID': 'tab:purple',
    'User Behavior Class': 'tab:red',
    'App Usage Time (min/day)': 'brown',
    'Data Usage (MB/day)': 'black',
    'Battery Drain (mAh/day)': 'blue',
    'Number of Apps Installed': 'tab:grey',
}
plt.figure(figsize=[12, 10])
sns.scatterplot(df[['User Behavior Class']], palette=categories, legend="auto")

# Bar plot of the remaining columns; `columns=` already selects the axis, so
# the redundant `axis=1` argument is not needed.
bar_data = df.drop(columns=['Battery Drain (mAh/day)', 'Data Usage (MB/day)', 'User ID'])
plt.figure(figsize=[18, 8], edgecolor='black')
sns.barplot(bar_data)

# NOTE(review): X and Y are not defined in the visible part of this file; they
# must be created (feature matrix and target) before this line runs - confirm.
# 70/30 split, stratified on the target so class proportions are preserved.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.3, stratify=Y
)
print("The shape of both X and Y are: ", X_train.shape, Y_train.shape)
# Recorded output: The shape of both X and Y are: (490, 8) (490,)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
def KNN_loop(k_values=range(1, 100)):
    """Return the test-set error rate of a KNN classifier for each k.

    For every ``k`` in ``k_values`` a ``KNeighborsClassifier`` is fitted on the
    module-level ``X_train`` / ``Y_train`` and scored on ``X_test`` /
    ``Y_test``.

    Args:
        k_values: Iterable of neighbour counts to evaluate. Defaults to
            ``range(1, 100)``, i.e. k = 1..99.

    Returns:
        A list with one error rate (fraction of misclassified test samples)
        per evaluated ``k``, in the same order as ``k_values``.
    """
    error_rate = []
    for k in k_values:
        model = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
        prediction = model.predict(X_test)
        # Mean of the element-wise mismatch mask == share of wrong predictions.
        error_rate.append(np.mean(prediction != Y_test))
    # Returned (not plotted here) so the caller can plot or analyse it later.
    return error_rate
# Evaluate every candidate k once and plot the resulting test error curve.
error_rate = KNN_loop()
plt.figure(figsize=[8, 6])
# KNN_loop() evaluates k = 1, 2, ..., so derive the x axis from the result
# length instead of repeating the hard-coded range.
k_axis = range(1, len(error_rate) + 1)
# The label gives plt.legend() an entry to show; without a labelled artist the
# legend is empty and matplotlib emits a warning.
plt.plot(k_axis, error_rate, linestyle='--', marker='o', color='tab:red',
         label="Test error rate")
plt.grid()
plt.xlabel("K Value")
plt.ylabel("Error Rate")
plt.title("K Value Evaluation")
plt.legend()
plt.show()
from sklearn.model_selection import cross_val_score
# Fit the final classifier with k=3 and evaluate it on the held-out test split.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
prediction = model.predict(X_test)
metrics_report = metrics.classification_report(Y_test, prediction)
print(metrics_report)
# Recorded output:
#               precision    recall  f1-score   support
#
#            1       1.00      1.00      1.00        41
#            2       1.00      1.00      1.00        44
#            3       1.00      1.00      1.00        43
#            4       1.00      1.00      1.00        41
#            5       1.00      1.00      1.00        41
#
#     accuracy                           1.00       210
#    macro avg       1.00      1.00      1.00       210
# weighted avg       1.00      1.00      1.00       210

# 5-fold cross-validated accuracy on the training split.
scores = cross_val_score(model, X_train, Y_train, cv=5, scoring='accuracy')
print("Cross-validation scores:", scores)
# Recorded output: Cross-validation scores: [1. 1. 1. 1. 1.]

# Use the classifier's own class labels for both the matrix ordering and the
# tick labels; otherwise the heatmap axes show positional indices (0..4)
# instead of the actual classes.
class_labels = model.classes_
confusion_mat = metrics.confusion_matrix(Y_test, prediction, labels=class_labels)
plt.figure(figsize=[7, 5])
# fmt="d": the cells are integer counts, so annotate them as integers.
sns.heatmap(confusion_mat, annot=True, fmt="d",
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()
# Here, I needed to convert the X features to a numpy array:
# Decision-region plot on two features only (a 2-D plot needs exactly two).
# The features are converted to a numeric numpy array for plotting.
feature_columns = ['App Usage Time (min/day)', 'Screen On Time (hours/day)']
features = df[feature_columns].apply(pd.to_numeric, errors='coerce')
# Rows where either feature failed numeric conversion are dropped. The same
# mask is applied to the target so X and Y stay row-aligned.
complete_rows = features.notna().all(axis=1).to_numpy()
X = features[complete_rows].to_numpy()
# Encode the target as consecutive integers (0..n_classes-1).
# NOTE(review): assumes Y (defined outside the visible code) has one entry per
# row of df, in the same order - confirm.
le = LabelEncoder()
Y = le.fit_transform(np.asarray(Y)[complete_rows])
# Fit the KNN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, Y)
# Plot decision regions using the unscaled (original) data
plt.figure(figsize=(10, 6))
ax = plot_decision_regions(X, Y, clf=knn, legend=2)
plt.xlabel('App Usage Time (min/day)')
plt.ylabel('Screen On Time (hours/day)')
plt.title('Decision Regions for User Behavior Classification')
# Relabel the legend with the original class values rather than the encoded
# integers produced by LabelEncoder.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, [str(c) for c in le.classes_],
          loc='upper left', title='Behavior Class')
plt.show()